# Global knitr chunk defaults for the whole document:
# echo = TRUE and comment = NA are applied to every chunk that follows.
knitr::opts_chunk$set(echo = TRUE, comment = NA)
library(DT)
library(xtable)
library(dplyr)
library(ggplot2)
library(plotly)
# Read the raw activity log from disk, then turn the `date` column
# (read in as text) into proper Date values.
mydata <- data.frame(
  read.csv(file = "activity.csv", stringsAsFactors = FALSE)
)
mydata[["date"]] <- as.Date(mydata[["date"]])
This assignment uses data collected every day, at five-minute intervals over a two-month period, with a personal activity monitoring device.
Here, missing values in the steps column were excluded. The rest of the data was grouped by date, and the mean, total, and median number of steps for each day were computed. These values are shown below as the columns of the summarized data for each day. The total number of observations is 17568.
# Drop every row that has an NA in any column.
mydata <- na.omit(mydata)

# Per-day statistics of the step counts: mean (rounded to 2 decimals),
# total, and median.
DataSummary <- summarize(
  group_by(mydata, date),
  MeanSteps = round(mean(steps), digits = 2),
  SumSteps = sum(steps),
  MedianSteps = median(steps)
)

# Show the first ten days as a captioned table.
knitr::kable(
  head(DataSummary, n = 10),
  caption = "The mean, total number of steps and median steps per day"
)
| date | MeanSteps | SumSteps | MedianSteps |
|---|---|---|---|
| 2012-10-02 | 0.44 | 126 | 0 |
| 2012-10-03 | 39.42 | 11352 | 0 |
| 2012-10-04 | 42.07 | 12116 | 0 |
| 2012-10-05 | 46.16 | 13294 | 0 |
| 2012-10-06 | 53.54 | 15420 | 0 |
| 2012-10-07 | 38.25 | 11015 | 0 |
| 2012-10-09 | 44.48 | 12811 | 0 |
| 2012-10-10 | 34.38 | 9900 | 0 |
| 2012-10-11 | 35.78 | 10304 | 0 |
| 2012-10-12 | 60.35 | 17382 | 0 |
# Histogram of the daily step totals (SumSteps); bars are filled by bin count.
# NOTE(review): the `..count..` notation is deprecated in recent ggplot2
# releases in favour of after_stat(count); it is kept here so the chunk still
# runs on older ggplot2 versions.
gplot1 <- ggplot(data = DataSummary, aes(SumSteps)) +
  geom_histogram(alpha = .5, col = "blue",
                 aes(fill = ..count..)) +
  labs(title = "Histogram of Total Number of Steps Per Day",
       x = "Total number of steps taken each day")

# Render the interactive version of the plot. A bare expression is used
# instead of return(): return() is only meaningful inside a function and
# fails with "no function to return from" when the line is run at the console.
ggplotly(gplot1)
Below is a time series plot of daily activity, with a vertical line drawn at the five-minute interval that has the maximum number of steps.
# Interval identifier of the observation with the highest step count.
# The vertical line is drawn on the interval (x) axis, so it needs the
# interval at which the maximum occurs, not the maximum step count itself.
maxint <- mydata$interval[which.max(mydata$steps)]

# Scatter of steps against interval, with a dotted red line marking the
# interval that holds the maximum number of steps.
gplot2 <- ggplot(data = mydata, aes(x = interval, y = steps)) +
  geom_point() +
  geom_vline(xintercept = maxint, linetype = "dotted",
             color = "red", size = 1.5) +
  labs(title = "Time series plot of steps vs intervals",
       x = "Intervals")

# Render the interactive version. A bare expression is used instead of
# return(), which is only valid inside a function body.
ggplotly(gplot2)
# Re-read the original file so the rows with missing values are present
# again, parse the dates, and count how many cells are NA across all columns.
mydataM <- data.frame(
  read.csv(file = "activity.csv", stringsAsFactors = FALSE)
)
mydataM[["date"]] <- as.Date(mydataM[["date"]])
missingData <- sum(is.na(mydataM))
The missing values are only in the steps column. We will replace each missing value with the mean value of that column and verify that the new dataset has zero missing values afterwards.
# Fill every missing step count with the mean of the observed step counts,
# then recount the NA cells in the whole data frame.
mydataM$steps <- replace(
  mydataM$steps,
  is.na(mydataM$steps),
  mean(mydataM$steps, na.rm = TRUE)
)
missingDataM <- sum(is.na(mydataM))
The total number of observations, with missing values replaced, is 17568.
# Per-day mean, total, and median of the imputed step counts,
# each rounded to two decimals.
DataSum <- summarize(
  group_by(mydataM, date),
  M_Steps = round(mean(steps), digits = 2),
  S_Steps = round(sum(steps), digits = 2),
  MedSteps = round(median(steps), digits = 2)
)

# Show the first ten days as a captioned table.
knitr::kable(
  head(DataSum, n = 10),
  caption = "The Mean, Total and Median steps per day"
)
| date | M_Steps | S_Steps | MedSteps |
|---|---|---|---|
| 2012-10-01 | 37.38 | 10766.19 | 37.38 |
| 2012-10-02 | 0.44 | 126.00 | 0.00 |
| 2012-10-03 | 39.42 | 11352.00 | 0.00 |
| 2012-10-04 | 42.07 | 12116.00 | 0.00 |
| 2012-10-05 | 46.16 | 13294.00 | 0.00 |
| 2012-10-06 | 53.54 | 15420.00 | 0.00 |
| 2012-10-07 | 38.25 | 11015.00 | 0.00 |
| 2012-10-08 | 37.38 | 10766.19 | 37.38 |
| 2012-10-09 | 44.48 | 12811.00 | 0.00 |
| 2012-10-10 | 34.38 | 9900.00 | 0.00 |
# Histogram of the daily step totals computed from the imputed data.
# The title and x-axis label both describe daily totals, so the histogram is
# built from S_Steps (the per-day sum) rather than M_Steps (the per-day mean).
# NOTE(review): the title says "without missing values imputed" although
# DataSum is derived from the imputed data set; confirm the intended wording.
gplot3 <- ggplot(data = DataSum, aes(S_Steps)) +
  geom_histogram(alpha = .5, col = "blue",
                 aes(fill = ..count..)) +
  labs(title = "Histogram of the total steps without missing values imputed",
       x = "Total number of steps taken each day")

# Render the interactive version. A bare expression is used instead of
# return(), which is only valid inside a function body.
ggplotly(gplot3)
The results show that replacing all 2304 missing values with the mean value has the effect of increasing the overall values (overall mean, total number of steps, and median) compared to the previous values computed with the missing observations removed.
mydataM$date <- as.Date(mydataM$date)

# Create a factor variable 'WDays' that labels each row "Weekday" or "Weekend".
WeekDays <- c('Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday')

# weekdays() returns day names in the current locale, so comparing its output
# with the English names above labels every row "Weekend" under a non-English
# locale. The numeric day of week from POSIXlt (0 = Sunday ... 6 = Saturday)
# does not depend on the locale, so it is mapped to English names explicitly.
mydataM$WDays <- local({
  english_days <- c("Sunday", "Monday", "Tuesday", "Wednesday",
                    "Thursday", "Friday", "Saturday")
  day_name <- english_days[as.POSIXlt(mydataM$date)$wday + 1L]
  factor(day_name %in% WeekDays,
         levels = c(FALSE, TRUE), labels = c('Weekend', 'Weekday'))
})

# Number of observations in each category.
table(mydataM$WDays)
Weekend Weekday
4608 12960
# Steps against interval as a scatter plot, one panel per WDays level.
ggplot(mydataM, aes(x = interval, y = steps)) +
  geom_point() +
  facet_wrap(~WDays)
# Keep only the rows labelled 'Weekend'.
WeekEnd <- filter(mydataM, WDays == 'Weekend')

# Line plot of steps against interval for the weekend rows.
ggplot(WeekEnd, aes(x = interval, y = steps)) +
  geom_line() +
  labs(title = "WeekEnd")
# Keep only the rows labelled 'Weekday'.
# The previous group_by(date, interval, WDays) was dropped: it does not change
# which rows the filter keeps, it only left the result grouped and made the
# filter run group by group. The subset is now built the same way as the
# weekend subset.
WeekDay <- mydataM %>%
  filter(WDays == 'Weekday')

# Line plot of steps against interval for the weekday rows.
ggplot(data = WeekDay) +
  geom_line(mapping = aes(x = interval, y = steps)) +
  labs(title = "WeekDay")